{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 01: Preprocess Reddit data\n",
    "\n",
    "This notebook reads data extracted from the [Reddit comments corpus](https://files.pushshift.io/reddit/comments/), cleans it, and converts it to JSONL in which each record has a `\"text\"` property and `\"meta\"` data (by default, the name of the subreddit and the creation timestamp). The resulting JSONL can be used for annotation with [Prodigy](https://prodi.gy)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths used by the cells below.\n",
    "INPUT_DATA = \"./raw_data\"       # a single .gz archive, or a directory searched recursively for .gz archives\n",
    "OUTPUT_FILE = \"./reddit.jsonl\"  # path of the JSONL file written by the last cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "%pip install srsly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "import gzip\n",
    "import srsly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Reddit(object):\n",
    "    \"\"\"Stream cleaned comments from Reddit.\"\"\"\n",
    "\n",
    "    pre_format_re = re.compile(r\"^[\\`\\*\\~]\")\n",
    "    post_format_re = re.compile(r\"[\\`\\*\\~]$\")\n",
    "    url_re = re.compile(r\"\\[([^]]+)\\]\\(%%URL\\)\")\n",
    "    link_re = re.compile(r\"\\[([^]]+)\\]\\(https?://[^\\)]+\\)\")\n",
    "\n",
    "    def __init__(\n",
    "        self, file_path, meta_keys={\"subreddit\": \"section\", \"created_utc\": \"utc\"}\n",
    "    ):\n",
    "        \"\"\"\n",
    "        file_path (unicode / Path): Path to archive or directory of archives.\n",
    "        meta_keys (dict): Meta data key included in the Reddit corpus, mapped\n",
    "            to display name in Prodigy meta.\n",
    "        RETURNS (Reddit): The Reddit loader.\n",
    "        \"\"\"\n",
    "        self.meta = meta_keys\n",
    "        self.file_path = Path(file_path)\n",
    "        if not self.file_path.exists():\n",
    "            raise IOError(f\"Can't find file path: {self.file_path}\")\n",
    "\n",
    "    def __iter__(self):\n",
    "        for file_path in self.iter_files():\n",
    "            with gzip.open(str(file_path), \"rb\") as f:\n",
    "                for line in f:\n",
    "                    line = line.strip()\n",
    "                    if not line:\n",
    "                        continue\n",
    "                    comment = srsly.json_loads(line)\n",
    "                    if self.is_valid(comment):\n",
    "                        text = self.strip_tags(comment[\"body\"])\n",
    "                        yield {\"text\": text, \"meta\": self.get_meta(comment)}\n",
    "\n",
    "    def get_meta(self, item):\n",
    "        return {name: item.get(key, \"n/a\") for key, name in self.meta.items()}\n",
    "\n",
    "    def iter_files(self):\n",
    "        if not self.file_path.is_dir():\n",
    "            return [self.file_path]\n",
    "        yield from self.file_path.glob(\"**/*.gz\")\n",
    "\n",
    "    def strip_tags(self, text):\n",
    "        text = self.link_re.sub(r\"\\1\", text)\n",
    "        text = text.replace(\"&gt;\", \">\").replace(\"&lt;\", \"<\")\n",
    "        text = self.pre_format_re.sub(\"\", text)\n",
    "        text = self.post_format_re.sub(\"\", text)\n",
    "        text = re.sub(r\"\\s+\", \" \", text)\n",
    "        return text.strip()\n",
    "\n",
    "    def is_valid(self, comment):\n",
    "        return (\n",
    "            comment[\"body\"] is not None\n",
    "            and comment[\"body\"] != \"[deleted]\"\n",
    "            and comment[\"body\"] != \"[removed]\"\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reddit instances are iterable: iterating yields one dict with \"text\" and\n",
    "# \"meta\" per valid comment, read from the archive(s) at INPUT_DATA.\n",
    "stream = Reddit(INPUT_DATA)\n",
    "# Pass the iterable to srsly, which writes the records to OUTPUT_FILE.\n",
    "srsly.write_jsonl(OUTPUT_FILE, stream)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7.2 64-bit ('.env': venv)",
   "language": "python",
   "name": "python37264bitenvvenv9eb7caf448714d3f8d7dc6238703fa1e"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
